#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_MMX

/* Define STRICT_COMPAT to get exactly the same result as the C function
 * (slightly slower); without it the difference is barely noticeable.
 */
// #define STRICT_COMPAT

/* NOTE(review): the three BUFF* constants are not referenced by the code
 * visible in this file. */
#define BUFFPOINTNB 16
#define BUFFPOINTMASK 0xffff
#define BUFFINCR 0xff

/* Positions handled by the zoom filter carry PERTEDEC fractional bits,
 * i.e. sqrtperte sub-steps per pixel. */
#define sqrtperte 16
/* a % sqrtperte  <=>  a & PERTEMASK */
#define PERTEMASK 0xf
/* a / sqrtperte  <=>  a >> PERTEDEC */
#define PERTEDEC 4


/*#define MMX_TRACE*/
#include "mmx.h"
/*#include "xmmx.h"*/
#include "goom_graphic.h"

/*
 * Report whether the feature flagged by bit 3 (mask 0x8) of mm_support()
 * is available.
 *
 * Returns 1 when that bit is set, 0 otherwise.
 */
int xmmx_supported (void) {
	const int feature_bit = 0x8;

	if (mm_support() & feature_bit) {
		return 1;
	}
	return 0;
}

/*
 * Zoom filter implemented with MMX inline assembly.
 *
 * For every index `loop` in [0, prevX*prevY):
 *   1. load the position pairs brutS[loop] and brutD[loop] (two 32-bit
 *      lanes each) and blend them using buffratio as a 16-bit fraction;
 *   2. clip each lane against ((prevX-1)<<PERTEDEC, (prevY-1)<<PERTEDEC):
 *      a lane that is not strictly below its limit is forced to 0
 *      (with STRICT_COMPAT, both lanes are zeroed if either one overflows);
 *   3. use the low 4 bits of each lane to pick one packed 32-bit entry of
 *      precalCoef (four 8-bit weights);
 *   4. use the lanes shifted right by 4 to address a 2x2 block of pixels in
 *      expix1 (two adjacent pixels on one row, two on the next row);
 *   5. multiply each pixel channel by its weight, sum, shift right by 8,
 *      and store the resulting pixel into expix2[loop].
 *
 * Parameters:
 *   prevX, prevY - buffer dimensions; prevX is also used as the row stride.
 *   expix1       - source pixels. Its four corner pixels are overwritten
 *                  with 0 before the loop starts.
 *   expix2       - destination pixels, one written per loop iteration.
 *   lbruS, lbruD - reinterpreted as arrays of mmx_t (one 64-bit pair per
 *                  output pixel) holding the two position sets to blend.
 *   buffratio    - blend weight between lbruS and lbruD.
 *   precalCoef   - 16x16 table of packed interpolation coefficients.
 *
 * Register usage relies on MMX state surviving between the separate asm
 * statements: mm6 (ratio) and mm7 (zero) are set once before the loop and
 * never modified inside it. The function ends with `emms`.
 *
 * NOTE(review): the pixel loads use a scale of 4 bytes per Pixel — assumes
 * sizeof(Pixel) == 4; confirm against goom_graphic.h.
 */
void zoom_filter_xmmx (int prevX, int prevY,
                       Pixel *expix1, Pixel *expix2,
                       int *lbruS, int *lbruD, int buffratio,
                       int precalCoef[16][16])
{
	int bufsize = prevX * prevY; /* buffer size, in pixels */
	volatile int loop;                    /* loop counter */

	mmx_t *brutS = (mmx_t*)lbruS; /* source transform buffer */
	mmx_t *brutD = (mmx_t*)lbruD; /* destination transform buffer */

	volatile mmx_t prevXY;
	volatile mmx_t ratiox;
	/*	volatile mmx_t interpix; */

	/* Zero the four corner pixels of the source buffer.
	 * NOTE(review): presumably so that clipped (zeroed) positions sample
	 * black pixels — confirm with the reference C implementation. */
	expix1[0].val=expix1[prevX-1].val=expix1[prevX*prevY-1].val=expix1[prevX*prevY-prevX].val=0;

	/* Clipping limits, expressed with PERTEDEC fractional bits:
	 * low lane = horizontal limit, high lane = vertical limit. */
	prevXY.ud[0] = (prevX-1)<<PERTEDEC;
	prevXY.ud[1] = (prevY-1)<<PERTEDEC;

	/* Same blend ratio in both lanes. */
	ratiox.d[0] = buffratio;
	ratiox.d[1] = buffratio;

  __asm__ __volatile__ (
    "\n\t movq  %0,    %%mm6"
    "\n\t pslld $16,   %%mm6" /* mm6 = [rat16=buffratio<<16 | rat16=buffratio<<16] */
    "\n\t pxor  %%mm7, %%mm7" /* mm7 = 0 */
    ::"m"(ratiox));

    loop = 0;

    /*
     * NOTE: mm6 and mm7 are not modified inside the loop.
     */
    while (loop < bufsize) {
      /* Thread #1
       * pre :  mm6 = [rat16|rat16]
       * post : mm0 = S + ((D-S)*rat16), one value per 32-bit lane
       * modified = mm0,mm1,mm2
       */

      __asm__ __volatile__ (
        "#1 \n\t movq       %0, %%mm0"
        "#1 \n\t movq       %1, %%mm1"
        "#1 \n\t psubd   %%mm0, %%mm1" /* mm1 = D - S */
        "#1 \n\t movq    %%mm1, %%mm2" /* mm2 = D - S */
        "#1 \n\t pslld     $16, %%mm1"
        "#1 \n\t pmullw  %%mm6, %%mm2"
        "#1 \n\t pmulhuw %%mm6, %%mm1"

        "#1 \n\t pslld   $16,   %%mm0"
        "#1 \n\t paddd   %%mm2, %%mm1" /* mm1 = (D - S) * buffratio >> 16 */

        "#1 \n\t paddd   %%mm1, %%mm0" /* mm0 = S + mm1 */
        "#1 \n\t psrld   $16,   %%mm0"
        :
        : "m" (brutS[loop]), "m" (brutD[loop])
      ); /* mm0 = blended position, reduced to 16 bits per lane */

      /*
       * pre : mm0 : position vector on screen
       *       prevXY : coordinate of the lower-right point on screen
       * post : clipped mm0
       * modified : mm0,mm1,mm2
       */
      __asm__ __volatile__ (
        "#1 \n\t movq       %0, %%mm1"
        "#1 \n\t pcmpgtd %%mm0, %%mm1"
        /* mm1 now holds, for each lane:
         *   all ones  if prevXY >  position
         *   all zeros if prevXY <= position */
#ifdef STRICT_COMPAT
        "#1 \n\t movq      %%mm1, %%mm2"
        "#1 \n\t punpckhdq %%mm2, %%mm2"
        "#1 \n\t punpckldq %%mm1, %%mm1"
        "#1 \n\t pand      %%mm2, %%mm0"
#endif
        "#1 \n\t pand      %%mm1, %%mm0" /* zero the lane(s) that overflow */
        :
        : "m" (prevXY)
      );

      /* Thread #2
       * pre :  mm0 : clipped position on screen
       *
       * post : mm3 : packed coefs for this position
       *        mm1 : high lane of the position moved to the low lane
       *
       * modif : eax,esi
       *
       * The low lane's 4 fractional bits select the table row (x64 bytes),
       * the high lane's 4 fractional bits select the 32-bit entry in it.
       * NOTE(review): older comments labelled mm1 as "X", but mm1 is the
       * value multiplied by prevX further down, i.e. the row coordinate.
       */
      __asm__ __volatile__ (
        "#2 \n\t movd  %%mm0, %%esi"
        "#2 \n\t movq  %%mm0, %%mm1"

        "#2 \n\t andl  $15,   %%esi"
        "#2 \n\t psrlq $32,   %%mm1"

        "#2 \n\t shll  $6,    %%esi"
        "#2 \n\t movd  %%mm1, %%eax"
#if defined(ARCH_X86_64)
        "#2 \n\t addq  %0,    %%rsi"
        "#2 \n\t andl  $15,   %%eax"
        "#2 \n\t movd (%%rsi,%%rax,4), %%mm3"
#elif defined(ARCH_X86_X32)
        "#2 \n\t addl  %0,    %%esi"
        "#2 \n\t andl  $15,   %%eax"
        "#2 \n\t movd (%%rsi,%%rax,4), %%mm3"
#else
        "#2 \n\t addl  %0,    %%esi"
        "#2 \n\t andl  $15,   %%eax"
        "#2 \n\t movd (%%esi,%%eax,4), %%mm3"
#endif
        :
        : "g" (precalCoef)
#if defined(ARCH_X86_64) || defined(ARCH_X86_X32)
        : "rax", "rsi"
#else
        : "eax", "esi"
#endif
      );

      /*
       * coefficient extraction... (Thread #3)
       *
       * pre : packed coefs in mm3
       *
       * post : each coef byte replicated four times:
       *        mm3 holds c1 (low half) & c2 (high half)
       *        mm4 holds c3 (low half) & c4 (high half)
       *
       * modif : mm5
       */

      /* (Thread #4)
       * pre : mm0 : position, low lane used as column after >> 4
       *       mm1 : position, low lane used as row after >> 4
       *
       * post : mm0 : two adjacent pixels at expix1[position]
       *        mm2 : two adjacent pixels at expix1[position+prevX]
       *
       * modif : eax, esi
       *
       * Threads #3 and #4 are interleaved instruction by instruction.
       */
      __asm__ __volatile__ (
        "#2 \n\t psrld     $4,    %%mm0"
        "#2 \n\t psrld     $4,    %%mm1" /* PERTEDEC = $4 */

        "#4 \n\t movd      %%mm1, %%eax"
        "#3 \n\t movq      %%mm3, %%mm5" 

        "#4 \n\t imull     %1,    %%eax"
        "#4 \n\t movd      %%mm0, %%esi"

        "#3 \n\t punpcklbw %%mm5, %%mm3"
        "#4 \n\t addl      %%esi, %%eax"

        "#3 \n\t movq      %%mm3, %%mm4"
        "#3 \n\t movq      %%mm3, %%mm5"
#if defined(ARCH_X86_64)
        "#4 \n\t movq      %0,    %%rsi"
        "#3 \n\t punpcklbw %%mm5, %%mm3"
        "#4 \n\t movq      (%%rsi,%%rax,4), %%mm0"
        "#3 \n\t punpckhbw %%mm5, %%mm4"
        "#4 \n\t addl      %1,    %%eax"
        "#4 \n\t movq      (%%rsi,%%rax,4), %%mm2"
#elif defined(ARCH_X86_X32)
        "#4 \n\t movl      %0,    %%esi"
        "#3 \n\t punpcklbw %%mm5, %%mm3"
        "#4 \n\t movq      (%%rsi,%%rax,4), %%mm0"
        "#3 \n\t punpckhbw %%mm5, %%mm4"
        "#4 \n\t addl      %1,    %%eax"
        "#4 \n\t movq      (%%rsi,%%rax,4), %%mm2"
#else
        "#4 \n\t movl      %0,    %%esi"
        "#3 \n\t punpcklbw %%mm5, %%mm3"
        "#4 \n\t movq      (%%esi,%%eax,4), %%mm0"
        "#3 \n\t punpckhbw %%mm5, %%mm4"
        "#4 \n\t addl      %1,    %%eax"
        "#4 \n\t movq      (%%esi,%%eax,4), %%mm2"
#endif
        :
        : "g" (expix1), "g" (prevX)
#if defined(ARCH_X86_64) || defined(ARCH_X86_X32)
        : "rax", "rsi"
#else
        : "eax", "esi"
#endif
      );

		/*
		 * pre :       mm0 : expix1[position]
		 *             mm2 : expix1[position+prevX]
		 *       mm3 & mm4 : coefs
		 */

		/* copy the first two pixels so both mm0 and mm1 hold them */
		movq_r2r (mm0, mm1);            /* b1-v1-r1-a1-b2-v2-r2-a2 */

		/* unpack the first pixel to 16-bit channels */
		punpcklbw_r2r (mm7, mm0);       /* 00-b2-00-v2-00-r2-00-a2 */

		/* coefficient extraction... */

		movq_r2r (mm3, mm5);      /* c2-c2-c2-c2-c1-c1-c1-c1 */

		/*^in parallel^*/ /* unpack the second pixel */
		/*^*/ punpckhbw_r2r (mm7, mm1); /* 00-b1-00-v1-00-r1-00-a1 */

		punpcklbw_r2r (mm7, mm5);	/* 00-c1-00-c1-00-c1-00-c1 */
		punpckhbw_r2r (mm7, mm3);	/* 00-c2-00-c2-00-c2-00-c2 */

		/* multiply the pixels by their coefficients */
		pmullw_r2r (mm5, mm0);		/* c1*b2-c1*v2-c1*r2-c1*a2 */
		pmullw_r2r (mm3, mm1);		/* c2*b1-c2*v1-c2*r1-c2*a1 */
		paddw_r2r (mm1, mm0);

		/* ...extraction of the last two coefficients */
		movq_r2r (mm4, mm5);			/* c4-c4-c4-c4-c3-c3-c3-c3 */
		punpcklbw_r2r (mm7, mm4);	/* 00-c3-00-c3-00-c3-00-c3 */
		punpckhbw_r2r (mm7, mm5);	/* 00-c4-00-c4-00-c4-00-c4 */

		/* fetch the last two pixels (next row) */
		movq_r2r (mm2, mm1);

		/* unpack those pixels */
		punpcklbw_r2r (mm7, mm1);
		punpckhbw_r2r (mm7, mm2);

		/* multiply by the coefficients */
		pmullw_r2r (mm4, mm1);
		pmullw_r2r (mm5, mm2);

		/* add these products to the running total */
		paddw_r2r (mm1, mm0);
		paddw_r2r (mm2, mm0);

		/* divide by 256 (shift right by 8), then repack the final pixel */
		psrlw_i2r (8, mm0);
		packuswb_r2r (mm7, mm0);

		/* store the low 32 bits (one pixel) into the destination buffer */
		movd_r2m (mm0,expix2[loop]);

		++loop;
	}
	/* Leave MMX state before returning. */
/*#ifdef HAVE_ATHLON*/
	__asm__ __volatile__ ("emms\n");
/*#else
	emms();
#endif*/
}

/*
 * Blend one pixel: load _backbuf into mm0, add _col to it with
 * paddusb_m2r (presumably the saturating unsigned byte add PADDUSB
 * wrapped by mmx.h), and store the low 32 bits of mm0 into _out.
 * Clobbers mm0.
 *
 * Wrapped in do { ... } while (0) so that the macro followed by ';'
 * forms exactly one statement and is safe in an unbraced if/else.
 */
#define DRAWMETHOD_PLUS_XMMX(_out,_backbuf,_col) \
do { \
	movd_m2r(_backbuf, mm0); \
	paddusb_m2r(_col, mm0); \
	movd_r2m(mm0, _out); \
} while (0)

/* Drawing primitive used by draw_line_xmmx: blends `col` into the pixel
 * pointed to by the local `p` (both names must be in scope at the use). */
#define DRAWMETHOD DRAWMETHOD_PLUS_XMMX(*p,*p,col)

/*
 * Draw a line between (x1,y1) and (x2,y2) into `data`, a pixel buffer whose
 * rows are `screenx` pixels wide, applying DRAWMETHOD (blend of `col`) to
 * every visited pixel.
 *
 * Nothing is drawn unless both endpoints lie inside
 * [0,screenx) x [0,screeny); there is no partial clipping.
 *
 * Vertical and horizontal lines are walked directly. Other lines step one
 * pixel at a time along their longer axis and track the other coordinate
 * as a 16.16 fixed-point accumulator.
 *
 * `emms` is executed on every path before returning.
 */
void draw_line_xmmx (Pixel *data, int x1, int y1, int x2, int y2, int col, int screenx, int screeny)
{
	Pixel *p;   /* current pixel; the name is required by DRAWMETHOD */
	int dx, dy;
	int pos;    /* integer coordinate along the axis being stepped */
	int frac;   /* 16.16 fixed-point coordinate along the other axis */
	int slope;  /* 16.16 increment applied to frac at each step */

	/* Both endpoints must be on screen. */
	if (!((x1 >= 0) && (x1 < screenx) && (x2 >= 0) && (x2 < screenx) &&
	      (y1 >= 0) && (y1 < screeny) && (y2 >= 0) && (y2 < screeny))) {
		goto finish;
	}

	/* Order the endpoints so that x1 <= x2. */
	if (x1 >= x2) {
		int swap_x = x1;
		int swap_y = y1;

		x1 = x2;
		y1 = y2;
		x2 = swap_x;
		y2 = swap_y;
	}
	dx = x2 - x1;
	dy = y2 - y1;

	if (dx == 0) {
		/* Vertical line (or single point): walk from the smaller y down. */
		int top = (y1 < y2) ? y1 : y2;
		int bottom = (y1 < y2) ? y2 : y1;

		p = data + ((screenx * top) + x1);
		for (pos = top; pos <= bottom; pos++) {
			DRAWMETHOD;
			p += screenx;
		}
	}
	else if (dy == 0) {
		/* Horizontal line; x1 < x2 is guaranteed here. */
		p = data + ((screenx * y1) + x1);
		for (pos = x1; pos <= x2; pos++) {
			DRAWMETHOD;
			p++;
		}
	}
	else {
		int downward = (y2 > y1);          /* y grows from point 1 to 2 */
		int rows = downward ? dy : -dy;    /* |dy| */

		if (rows > dx) {
			/* Steep: one pixel per row, x tracked in fixed point. */
			int ystep = downward ? 1 : -1;
			int remaining;

			slope = (dx << 16) / rows;
			/* Upward lines start their x accumulator at x1 + 1. */
			frac = (downward ? x1 : (x1 + 1)) << 16;
			pos = y1;
			for (remaining = rows; remaining >= 0; remaining--) {
				p = data + ((screenx * pos) + (frac >> 16));
				DRAWMETHOD;
				frac += slope;
				pos += ystep;
			}
		}
		else {
			/* Shallow: one pixel per column, y tracked in fixed point
			 * (slope is negative for upward lines). */
			slope = (dy << 16) / dx;
			frac = y1 << 16;
			for (pos = x1; pos <= x2; pos++) {
				p = data + ((screenx * (frac >> 16)) + pos);
				DRAWMETHOD;
				frac += slope;
			}
		}
	}

finish:
	__asm__ __volatile__ ("emms\n");
}

#endif
